# Load the bookings data: snake_case the column names, turn character
# columns into factors, sort columns alphabetically, then group them by
# type (dates, factors, numerics), and keep only non-canceled bookings.
a <- read_csv('hotel_bookings.csv') %>%
  clean_names() %>%
  mutate(across(where(is.character), factor)) %>%
  select(sort(tidyselect::peek_vars())) %>%
  select(
    where(is.Date),
    where(is.factor),
    where(is.numeric)
  ) %>%
  filter(is_canceled == 0) # filter to non-canceled bookings
# is_canceled is constant (all 0) after the filter above; drop it along
# with reservation_status_date.
a$is_canceled <- NULL
a$reservation_status_date <- NULL
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
#removing bottom outliers
# Documentation: https://www.rdocumentation.org/packages/recipes/versions/0.1.14
#a recipe is used for preprocessing
# Preprocessing recipe for predicting adr from all other columns of the
# training data.
# Documentation: https://www.rdocumentation.org/packages/recipes/versions/0.1.14
c.recipe <- c.train %>% recipe(adr ~ . ) %>%
  #----------------------------
  # Combine the year / month-name / day columns into one arrival date.
  step_mutate(
    arrival.date = lubridate::make_date(
      arrival_date_year,
      match(arrival_date_month, month.name),
      arrival_date_day_of_month
    )
  ) %>%
  # enets can only use numeric data, so convert arrival.date to a
  # yyyymmdd-style numeric (e.g. "2016-07-01" -> 20160701).
  step_mutate(
    arrival.date = arrival.date %>%
      as.character %>%
      stringr::str_replace_all('-','') %>%
      as.numeric
  ) %>%
  # arrival_date_month needs to be converted from factor -> numeric
  step_mutate(
    arrival_date_month = match(arrival_date_month, month.name)
  ) %>%
  #----------------------------
  # Remove predictors that are highly correlated with other predictors
  # (step_corr filters on large absolute correlations).
  step_corr(all_numeric(), -all_outcomes()) %>%
  # Remove predictors with near-zero or zero variance.
  step_nzv(all_numeric(), -all_outcomes()) %>%
  step_zv(all_numeric(), -all_outcomes()) %>%
  #----------------------------
  # Reduce the number of levels for factors with many, many levels.
  step_other(agent, company, country) %>% # default threshold of 0.05
  #----------------------------
  # One-hot encode the factor predictors.
  step_dummy(
    agent, assigned_room_type, company,
    country, customer_type, deposit_type,
    distribution_channel, market_segment, meal,
    reservation_status, reserved_room_type,
    one_hot = TRUE
  ) %>%
  #----------------------------
  step_normalize(stays_in_weekend_nights, stays_in_week_nights) %>%
  step_pca(stays_in_weekend_nights, stays_in_week_nights, num_comp = 1) # keep PC1 only
# Summarize the recipe's steps.
c.recipe %>% tidy()
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 28
##
## Training data contained 16902 data points and no missing data.
##
## Operations:
##
## Variable mutation for arrival.date [trained]
## Variable mutation for arrival.date [trained]
## Variable mutation for arrival_date_month [trained]
## Correlation filter removed 2 items [trained]
## Sparse, unbalanced variable filter removed babies, children, ... [trained]
## Zero variance filter removed no terms [trained]
## Collapsing factor levels for agent, company, country [trained]
## Dummy variables from agent, assigned_room_type, company, ... [trained]
## Centering and scaling for stays_in_weekend_nights, stays_in_week_nights [trained]
## PCA extraction with stays_in_weekend_nights, stays_in_week_nights [trained]
# Prep (train) the recipe once and reuse it for both bakes — prepping is
# the expensive step, and the original code ran it twice.
c.prepped <- c.recipe %>% prep()
#'Using the recipe, prep & bake the train ds'
c.baked.train <- c.prepped %>% bake(new_data = NULL) %>%
  select(sort(tidyselect::peek_vars()))
#'Using the recipe, prep & bake the test ds'
c.baked.test <- c.prepped %>% bake(new_data = c.test) %>%
  select(sort(tidyselect::peek_vars()))
# Preview the baked data sets.
c.baked.train %>% head() %>% DT::datatable()
c.baked.test %>% head() %>% DT::datatable()
doParallel::registerDoParallel() # use parallel processing
# Tune the elastic-net workflow on the v-fold resamples over an
# automatically generated grid of 10 candidate hyper-parameter
# combinations; seed fixed for reproducibility.
set.seed(345)
c.en.tg <- tune_grid(
  c.en.wf,
  resamples = c.vfolds,
  grid = 10
) # create a tuning grid AUTOMATICALLY
# The knit extraction fused `c.en.tg` and `ggplotly(` into one token,
# which does not parse; the expressions are separated below.
c.en.tg
# Interactive plot: RMSE against each hyper-parameter's candidate values.
ggplotly(
  c.en.tg %>%
    collect_metrics() %>%
    filter(.metric == "rmse") %>%
    select(mean, penalty, mixture) %>%
    pivot_longer(penalty:mixture,
      values_to = "value",
      names_to = "parameter"
    ) %>%
    ggplot(aes(value, mean, color = parameter)) +
    geom_point(show.legend = FALSE, size = 3) +
    facet_wrap(~parameter, scales = "free_x") +
    labs(x = NULL, y = "RMSE")
)
# Tabulate the candidates by RMSE, best (lowest mean) first.
c.en.tg %>%
  collect_metrics() %>%
  filter(.metric == 'rmse') %>%
  select(mean, penalty, mixture, .config) %>%
  arrange(mean)
# Easiest Option
# Three ways to build a candidate hyper-parameter grid (outer parens
# print the result as it is assigned).
(en.grid.xpd <- expand_grid(
  mixture = seq(0.40, 0.50, by = 0.03),
  penalty = seq(0.25, 0.35, by = 0.02)
))
en.params <- parameters(penalty(), mixture())
(en.grid.reg <- grid_regular(en.params, levels = c(5, 5)))
# creates a SFD (space filling design grid), keeps param combinations as far away from each other
(en.grid.rdm <- grid_max_entropy(en.params, size = 15))
# Visualize each grid, penalty on a log scale.
en.grid.reg %>%
  ggplot(aes(x = mixture, y = penalty)) +
  geom_point() +
  scale_y_log10()
en.grid.rdm %>%
  ggplot(aes(x = mixture, y = penalty)) +
  geom_point() +
  scale_y_log10()
en.grid.xpd %>%
  ggplot(aes(x = mixture, y = penalty)) +
  geom_point() +
  scale_y_log10()
# Final evaluation of the tuned elastic net (the extraction had fused
# `scale_y_log10()` and `c.en.wf` onto one unparsable line):
c.en.wf %>%
  #1) finalize wf (recipe, model w/previously unknown hps) using best hps
  finalize_workflow(c.en.best.hps) %>%
  #2) fit on entire train and then execute/predict on test
  last_fit(c.split) %>%
  #3) evaluate metrics
  collect_predictions() %>%
  select(.pred, adr) %>%
  #'metric_set(rmse, rsq, mae)' is actually an in-line function you create
  metric_set(rmse, rsq, mae)(truth = adr, estimate = .pred)
# Documentation: https://www.rdocumentation.org/packages/recipes/versions/0.1.14
#a recipe is used for preprocessing
# Preprocessing recipe for predicting adr from all other columns of the
# training data (mirror of c.recipe for the r.* data split).
r.recipe <- r.train %>% recipe(adr ~ . ) %>%
  #----------------------------
  # Combine the year / month-name / day columns into one arrival date.
  step_mutate(
    arrival.date = lubridate::make_date(
      arrival_date_year,
      match(arrival_date_month, month.name),
      arrival_date_day_of_month
    )
  ) %>%
  # enets can only use numeric data, so convert arrival.date to a
  # yyyymmdd-style numeric (e.g. "2016-07-01" -> 20160701).
  step_mutate(
    arrival.date = arrival.date %>%
      as.character %>%
      stringr::str_replace_all('-','') %>%
      as.numeric
  ) %>%
  # arrival_date_month needs to be converted from factor -> numeric
  step_mutate(
    arrival_date_month = match(arrival_date_month, month.name)
  ) %>%
  #----------------------------
  # Remove predictors that are highly correlated with other predictors
  # (step_corr filters on large absolute correlations).
  step_corr(all_numeric(), -all_outcomes()) %>%
  # Remove predictors with near-zero or zero variance.
  step_nzv(all_numeric(), -all_outcomes()) %>%
  step_zv(all_numeric(), -all_outcomes()) %>%
  #----------------------------
  # Reduce the number of levels for factors with many, many levels.
  step_other(agent, company, country) %>% # default threshold of 0.05
  #----------------------------
  # One-hot encode the factor predictors.
  step_dummy(
    agent, assigned_room_type, company,
    country, customer_type, deposit_type,
    distribution_channel, market_segment, meal,
    reservation_status, reserved_room_type,
    one_hot = TRUE
  ) %>%
  #----------------------------
  step_normalize(stays_in_weekend_nights, stays_in_week_nights) %>%
  step_pca(stays_in_weekend_nights, stays_in_week_nights, num_comp = 1) # keep PC1 only
# Summarize the recipe's steps.
r.recipe %>% tidy()
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 28
##
## Training data contained 10571 data points and no missing data.
##
## Operations:
##
## Variable mutation for arrival.date [trained]
## Variable mutation for arrival.date [trained]
## Variable mutation for arrival_date_month [trained]
## Correlation filter removed 2 items [trained]
## Sparse, unbalanced variable filter removed babies, children, ... [trained]
## Zero variance filter removed no terms [trained]
## Collapsing factor levels for agent, company, country [trained]
## Dummy variables from agent, assigned_room_type, company, ... [trained]
## Centering and scaling for stays_in_weekend_nights, stays_in_week_nights [trained]
## PCA extraction with stays_in_weekend_nights, stays_in_week_nights [trained]
# Prep (train) the recipe once and reuse it for both bakes — prepping is
# the expensive step, and the original code ran it twice.
r.prepped <- r.recipe %>% prep()
#'Using the recipe, prep & bake the train ds'
r.baked.train <- r.prepped %>% bake(new_data = NULL) %>%
  select(sort(tidyselect::peek_vars()))
#'Using the recipe, prep & bake the test ds'
r.baked.test <- r.prepped %>% bake(new_data = r.test) %>%
  select(sort(tidyselect::peek_vars()))
# Preview the baked data sets.
r.baked.train %>% head() %>% DT::datatable()
r.baked.test %>% head() %>% DT::datatable()
doParallel::registerDoParallel() # use parallel processing
# Tune the elastic-net workflow on the v-fold resamples over an
# automatically generated grid of 10 candidate hyper-parameter
# combinations; seed fixed for reproducibility.
set.seed(345)
r.en.tg <- tune_grid(
  r.en.wf,
  resamples = r.vfolds,
  grid = 10
) # create a tuning grid AUTOMATICALLY
# The knit extraction fused `r.en.tg` and `ggplotly(` into one token,
# which does not parse; the expressions are separated below.
r.en.tg
# Interactive plot: RMSE against each hyper-parameter's candidate values.
ggplotly(
  r.en.tg %>%
    collect_metrics() %>%
    filter(.metric == "rmse") %>%
    select(mean, penalty, mixture) %>%
    pivot_longer(penalty:mixture,
      values_to = "value",
      names_to = "parameter"
    ) %>%
    ggplot(aes(value, mean, color = parameter)) +
    geom_point(show.legend = FALSE, size = 3) +
    facet_wrap(~parameter, scales = "free_x") +
    labs(x = NULL, y = "RMSE")
)
# Tabulate the candidates by RMSE, best (lowest mean) first.
r.en.tg %>%
  collect_metrics() %>%
  filter(.metric == 'rmse') %>%
  select(mean, penalty, mixture, .config) %>%
  arrange(mean)
# Easiest Option
# Three ways to build a candidate hyper-parameter grid (outer parens
# print the result as it is assigned).
(en.grid.xpd <- expand_grid(
  mixture = seq(0.40, 0.50, by = 0.03),
  penalty = seq(0.25, 0.35, by = 0.02)
))
en.params <- parameters(penalty(), mixture())
(en.grid.reg <- grid_regular(en.params, levels = c(5, 5)))
# creates a SFD (space filling design grid), keeps param combinations as far away from each other
(en.grid.rdm <- grid_max_entropy(en.params, size = 15))
# Visualize each grid, penalty on a log scale.
en.grid.reg %>%
  ggplot(aes(x = mixture, y = penalty)) +
  geom_point() +
  scale_y_log10()
en.grid.rdm %>%
  ggplot(aes(x = mixture, y = penalty)) +
  geom_point() +
  scale_y_log10()
en.grid.xpd %>%
  ggplot(aes(x = mixture, y = penalty)) +
  geom_point() +
  scale_y_log10()
# Final evaluation of the tuned elastic net (the extraction had fused
# `scale_y_log10()` and `r.en.wf` onto one unparsable line):
r.en.wf %>%
  #1) finalize wf (recipe, model w/previously unknown hps) using best hps
  finalize_workflow(r.en.best.hps) %>%
  #2) fit on entire train and then execute/predict on test
  last_fit(r.split) %>%
  #3) evaluate metrics
  collect_predictions() %>%
  select(.pred, adr) %>%
  #'metric_set(rmse, rsq, mae)' is actually an in-line function you create
  metric_set(rmse, rsq, mae)(truth = adr, estimate = .pred)